"""
Phase 10: BIS Bank Lending and Demographics
=============================================
Tests whether demographics predict cross-border bank lending positions,
using BIS Locational Banking Statistics (LBS).

Key question: Does adding bank lending data to the net/gross paper
change the core findings?
- Income balance dominance (Z₁=41.4*** on income_bal, trade null)
- KAOPEN gates returns not positions
- S-I suppression (not mediation)
- FX reserves as strongest instrument signal

If demographics predict bank lending strongly, and bank lending is
excluded from the current IIP-based analysis, findings could be biased.

Data: BIS LBS (1977-2025, quarterly → annual Q4)
Output: table_bis_bank_lending.md
"""

import sys
import numpy as np
import pandas as pd
from pathlib import Path
import zipfile
import warnings
warnings.filterwarnings('ignore')

PROJECT_DIR = Path("/mnt/c/demographics_capital_flows/net_gross")
ROOT_DIR = PROJECT_DIR.parent
RAW_DIR = PROJECT_DIR / "data" / "raw"
PROCESSED_DIR = PROJECT_DIR / "data" / "processed"
TABLES_DIR = PROJECT_DIR / "output" / "tables"

for d in [TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

sys.path.insert(0, str(ROOT_DIR / "multilateral" / "src"))
from model import PanelGLS

# BIS 2-letter → ISO3 mapping (common countries)
BIS_TO_ISO3 = {
    'US': 'USA', 'GB': 'GBR', 'DE': 'DEU', 'FR': 'FRA', 'JP': 'JPN',
    'CA': 'CAN', 'AU': 'AUS', 'CH': 'CHE', 'NL': 'NLD', 'BE': 'BEL',
    'IT': 'ITA', 'ES': 'ESP', 'AT': 'AUT', 'SE': 'SWE', 'NO': 'NOR',
    'DK': 'DNK', 'FI': 'FIN', 'IE': 'IRL', 'PT': 'PRT', 'GR': 'GRC',
    'KR': 'KOR', 'CN': 'CHN', 'IN': 'IND', 'BR': 'BRA', 'MX': 'MEX',
    'ZA': 'ZAF', 'TR': 'TUR', 'RU': 'RUS', 'ID': 'IDN', 'MY': 'MYS',
    'TH': 'THA', 'PH': 'PHL', 'SG': 'SGP', 'HK': 'HKG', 'TW': 'TWN',
    'CL': 'CHL', 'CO': 'COL', 'PE': 'PER', 'AR': 'ARG', 'SA': 'SAU',
    'AE': 'ARE', 'IL': 'ISR', 'PL': 'POL', 'CZ': 'CZE', 'HU': 'HUN',
    'RO': 'ROU', 'BG': 'BGR', 'HR': 'HRV', 'LU': 'LUX', 'NZ': 'NZL',
    'EG': 'EGY', 'NG': 'NGA', 'PK': 'PAK', 'BD': 'BGD', 'VN': 'VNM',
    'KE': 'KEN', 'GH': 'GHA', 'TZ': 'TZA', 'MA': 'MAR', 'TN': 'TUN',
    'LB': 'LBN', 'JO': 'JOR', 'KW': 'KWT', 'QA': 'QAT', 'BH': 'BHR',
    'LT': 'LTU', 'LV': 'LVA', 'EE': 'EST', 'SK': 'SVK', 'SI': 'SVN',
    'CY': 'CYP', 'MT': 'MLT', 'IS': 'ISL',
    # Aggregates
    '5A': 'WORLD', '5R': 'DEV', '5J': 'ALL',
}


def load_bis_lbs():
    """Load BIS LBS data, filtering to the series we need."""
    cache_file = PROCESSED_DIR / "bis_bank_claims.csv"
    if cache_file.exists():
        print(f"  Loading cached: {cache_file}")
        return pd.read_csv(cache_file)

    zip_path = RAW_DIR / "WS_LBS_D_PUB_csv_col.zip"
    if not zip_path.exists():
        raise FileNotFoundError(f"BIS LBS ZIP not found at {zip_path}")

    print("  Reading BIS LBS (536 MB, this takes a moment) ...")

    # Filter criteria for cross-border bank claims by counterparty
    filters = {
        'L_MEASURE': 'S',          # Stocks
        'L_POSITION': 'C',         # Claims (assets)
        'L_INSTR': 'A',            # All instruments
        'L_DENOM': 'TO1',          # All currencies
        'L_CURR_TYPE': 'A',        # All currency types
        'L_PARENT_CTY': '5J',      # All parent countries (residence basis)
        'L_REP_BANK_TYPE': 'A',    # All reporting banks
        'L_CP_SECTOR': 'A',        # All sectors
        'L_POS_TYPE': 'N',         # Cross-border only
    }

    # Read in chunks to manage memory
    chunks = []
    with zipfile.ZipFile(zip_path) as z:
        with z.open('WS_LBS_D_PUB_csv_col.csv') as f:
            reader = pd.read_csv(f, chunksize=50000, low_memory=False)
            for i, chunk in enumerate(reader):
                # Apply filters
                mask = pd.Series(True, index=chunk.index)
                for col, val in filters.items():
                    if col in chunk.columns:
                        mask &= (chunk[col] == val)
                filtered = chunk[mask]
                if len(filtered) > 0:
                    chunks.append(filtered)
                if i % 50 == 0:
                    print(f"    Processed {(i+1)*50000:,} rows, kept {sum(len(c) for c in chunks):,} ...")

    if not chunks:
        print("  WARNING: No matching rows found!")
        return pd.DataFrame()

    df = pd.concat(chunks, ignore_index=True)
    print(f"  Filtered to {len(df):,} series")

    # Extract Q4 values for annual data (use Q4 as end-of-year stock)
    # Columns are like '1990-Q4', '2024-Q4', etc.
    q4_cols = [c for c in df.columns if c.endswith('-Q4')]
    id_cols = ['L_REP_CTY', 'L_CP_COUNTRY']

    # Melt to long format
    long = df[id_cols + q4_cols].melt(
        id_vars=id_cols,
        var_name='quarter',
        value_name='claims_usd_mn'
    )
    long['year'] = long['quarter'].str[:4].astype(int)
    long['claims_usd_mn'] = pd.to_numeric(long['claims_usd_mn'], errors='coerce')
    long = long.dropna(subset=['claims_usd_mn'])

    # Map BIS codes to ISO3
    long['reporter_iso3'] = long['L_REP_CTY'].map(BIS_TO_ISO3)
    long['counterparty_iso3'] = long['L_CP_COUNTRY'].map(BIS_TO_ISO3)

    # Keep only rows where both are mapped
    long = long.dropna(subset=['reporter_iso3', 'counterparty_iso3'])

    # Drop aggregates
    agg_codes = ['WORLD', 'DEV', 'ALL']
    long = long[~long['reporter_iso3'].isin(agg_codes)]
    long = long[~long['counterparty_iso3'].isin(agg_codes)]

    result = long[['reporter_iso3', 'counterparty_iso3', 'year', 'claims_usd_mn']].copy()
    result.to_csv(cache_file, index=False)
    print(f"  Saved: {cache_file} ({len(result):,} obs)")
    return result


def aggregate_to_country_year(bilateral):
    """Aggregate bilateral claims to country-year totals."""
    # Total claims by reporter (outward bank lending)
    outward = bilateral.groupby(['reporter_iso3', 'year'])['claims_usd_mn'].sum().reset_index()
    outward.columns = ['iso3', 'year', 'bank_claims_out_mn']

    # Total claims on counterparty (inward bank lending = other countries' claims on you)
    inward = bilateral.groupby(['counterparty_iso3', 'year'])['claims_usd_mn'].sum().reset_index()
    inward.columns = ['iso3', 'year', 'bank_claims_in_mn']

    merged = outward.merge(inward, on=['iso3', 'year'], how='outer')
    merged['net_bank_claims_mn'] = merged['bank_claims_out_mn'].fillna(0) - merged['bank_claims_in_mn'].fillna(0)

    return merged


def merge_with_panel(bank_agg):
    """Merge bank lending data with net_gross panel."""
    panel = pd.read_csv(PROCESSED_DIR / "net_gross_panel.csv")
    panel = panel[panel['year'] <= 2024]

    # Get GDP for normalization (need USD GDP in millions)
    # ngdp_usd is nominal GDP in USD (check units)
    if 'ngdp_usd' in panel.columns:
        # WEO ngdp_usd is typically in billions
        panel['gdp_usd_mn'] = panel['ngdp_usd'] * 1000  # convert to millions
    elif 'gdp_ppp' in panel.columns:
        panel['gdp_usd_mn'] = panel['gdp_ppp']  # rough proxy
    else:
        panel['gdp_usd_mn'] = np.nan

    merged = panel.merge(bank_agg, on=['iso3', 'year'], how='left')

    # Normalize by GDP (claims are in USD millions)
    for col in ['bank_claims_out_mn', 'bank_claims_in_mn', 'net_bank_claims_mn']:
        gdp_col = col.replace('_mn', '_gdp')
        merged[gdp_col] = merged[col] / merged['gdp_usd_mn']

    return merged


def run_tests(panel):
    """Run the key tests."""
    results = {}

    # ── Test 1: Do demographics predict bank lending positions? ──
    print("\n" + "=" * 60)
    print("TEST 1: Demographics → Bank Lending Positions")
    print("=" * 60)

    dvs = {
        'bank_claims_out_gdp': 'Outward bank claims/GDP',
        'bank_claims_in_gdp': 'Inward bank claims/GDP',
        'net_bank_claims_gdp': 'Net bank claims/GDP',
    }

    test1 = []
    for dv, label in dvs.items():
        if dv not in panel.columns:
            continue

        est = panel.dropna(subset=[dv, 'Z_1', 'Z_2']).copy()
        if len(est) < 50:
            print(f"  {label}: only {len(est)} obs, skipping")
            continue

        model = PanelGLS()
        y = est[dv].values
        X = est[['Z_1', 'Z_2']].values
        model.fit(y, X, est['iso3'].values, est['year'].values)

        stars = '***' if model.pvalues[0] < 0.01 else ('**' if model.pvalues[0] < 0.05 else ('*' if model.pvalues[0] < 0.1 else ''))
        print(f"  {label}: Z₁={model.beta[0]:.4f}{stars} (p={model.pvalues[0]:.3f}), "
              f"N={model.n_obs}, R²={model.r_squared:.3f}")

        test1.append({
            'dv': dv, 'label': label,
            'Z1_beta': model.beta[0], 'Z1_se': model.se[0], 'Z1_p': model.pvalues[0],
            'Z2_beta': model.beta[1], 'Z2_p': model.pvalues[1],
            'n': model.n_obs, 'n_countries': est['iso3'].nunique(),
            'r2': model.r_squared,
        })
    results['test1'] = pd.DataFrame(test1)

    # ── Test 2: OECD vs non-OECD ──
    print("\n" + "=" * 60)
    print("TEST 2: OECD vs Non-OECD Split")
    print("=" * 60)

    test2 = []
    for subsample in ['OECD', 'Non-OECD']:
        oecd_col = 'oecd' if 'oecd' in panel.columns else ('is_oecd' if 'is_oecd' in panel.columns else None)
        if oecd_col:
            sub = panel[panel[oecd_col] == (1 if subsample == 'OECD' else 0)]
        else:
            continue

        for dv in ['bank_claims_out_gdp', 'bank_claims_in_gdp', 'net_bank_claims_gdp']:
            est = sub.dropna(subset=[dv, 'Z_1', 'Z_2']).copy()
            if len(est) < 30:
                continue

            model = PanelGLS()
            y = est[dv].values
            X = est[['Z_1', 'Z_2']].values
            model.fit(y, X, est['iso3'].values, est['year'].values)

            stars = '***' if model.pvalues[0] < 0.01 else ('**' if model.pvalues[0] < 0.05 else ('*' if model.pvalues[0] < 0.1 else ''))
            print(f"  {subsample} {dv}: Z₁={model.beta[0]:.4f}{stars} (p={model.pvalues[0]:.3f}), N={model.n_obs}")

            test2.append({
                'subsample': subsample, 'dv': dv,
                'Z1_beta': model.beta[0], 'Z1_se': model.se[0], 'Z1_p': model.pvalues[0],
                'n': model.n_obs, 'r2': model.r_squared,
            })
    results['test2'] = pd.DataFrame(test2)

    # ── Test 3: Z×KAOPEN interactions ──
    print("\n" + "=" * 60)
    print("TEST 3: Z₁×KAOPEN on Bank Lending")
    print("=" * 60)

    test3 = []
    if 'kaopen' in panel.columns:
        for dv in ['bank_claims_out_gdp', 'bank_claims_in_gdp', 'net_bank_claims_gdp']:
            est = panel.dropna(subset=[dv, 'Z_1', 'Z_2', 'kaopen']).copy()
            if len(est) < 50:
                continue

            est['Z1_x_kaopen'] = est['Z_1'] * est['kaopen']
            model = PanelGLS()
            y = est[dv].values
            X = est[['Z_1', 'Z_2', 'kaopen', 'Z1_x_kaopen']].values
            model.fit(y, X, est['iso3'].values, est['year'].values)

            stars_int = '***' if model.pvalues[3] < 0.01 else ('**' if model.pvalues[3] < 0.05 else ('*' if model.pvalues[3] < 0.1 else ''))
            print(f"  {dv}: Z₁×KA={model.beta[3]:.4f}{stars_int} (p={model.pvalues[3]:.3f}), N={model.n_obs}")

            test3.append({
                'dv': dv,
                'Z1_beta': model.beta[0], 'Z1_p': model.pvalues[0],
                'int_beta': model.beta[3], 'int_se': model.se[3], 'int_p': model.pvalues[3],
                'n': model.n_obs,
            })
    results['test3'] = pd.DataFrame(test3)

    # ── Test 4: Does adding bank lending change CA/income balance results? ──
    print("\n" + "=" * 60)
    print("TEST 4: Does Bank Lending Control Change Core Findings?")
    print("=" * 60)

    test4 = []
    core_dvs = ['ca_gdp', 'income_bal_gdp'] if 'income_bal_gdp' in panel.columns else ['ca_gdp']
    # Try alternative income balance column names
    for alt in ['income_balance_gdp', 'primary_income_gdp']:
        if alt in panel.columns:
            core_dvs.append(alt)

    for dv in core_dvs:
        if dv not in panel.columns:
            continue

        # Baseline: Z₁ + Z₂ only
        base_est = panel.dropna(subset=[dv, 'Z_1', 'Z_2']).copy()
        if len(base_est) < 50:
            continue
        m_base = PanelGLS()
        m_base.fit(base_est[dv].values, base_est[['Z_1', 'Z_2']].values,
                    base_est['iso3'].values, base_est['year'].values)

        # With bank lending control
        bank_col = 'net_bank_claims_gdp'
        ctrl_est = panel.dropna(subset=[dv, 'Z_1', 'Z_2', bank_col]).copy()
        if len(ctrl_est) < 50:
            continue

        # Same-sample baseline first!
        m_same = PanelGLS()
        m_same.fit(ctrl_est[dv].values, ctrl_est[['Z_1', 'Z_2']].values,
                    ctrl_est['iso3'].values, ctrl_est['year'].values)

        m_ctrl = PanelGLS()
        m_ctrl.fit(ctrl_est[dv].values, ctrl_est[['Z_1', 'Z_2', bank_col]].values,
                    ctrl_est['iso3'].values, ctrl_est['year'].values)

        attenuation = (m_ctrl.beta[0] - m_same.beta[0]) / m_same.beta[0] * 100 if abs(m_same.beta[0]) > 1e-10 else np.nan

        stars_b = '***' if m_base.pvalues[0] < 0.01 else ('**' if m_base.pvalues[0] < 0.05 else ('*' if m_base.pvalues[0] < 0.1 else ''))
        stars_s = '***' if m_same.pvalues[0] < 0.01 else ('**' if m_same.pvalues[0] < 0.05 else ('*' if m_same.pvalues[0] < 0.1 else ''))
        stars_c = '***' if m_ctrl.pvalues[0] < 0.01 else ('**' if m_ctrl.pvalues[0] < 0.05 else ('*' if m_ctrl.pvalues[0] < 0.1 else ''))

        print(f"  {dv}:")
        print(f"    Baseline (full):     Z₁={m_base.beta[0]:.2f}{stars_b} (p={m_base.pvalues[0]:.3f}), N={m_base.n_obs}")
        print(f"    Same-sample:         Z₁={m_same.beta[0]:.2f}{stars_s} (p={m_same.pvalues[0]:.3f}), N={m_same.n_obs}")
        print(f"    + bank lending ctrl: Z₁={m_ctrl.beta[0]:.2f}{stars_c} (p={m_ctrl.pvalues[0]:.3f}), N={m_ctrl.n_obs}")
        print(f"    Attenuation: {attenuation:+.0f}%" if pd.notna(attenuation) else "    Attenuation: —")

        test4.append({
            'dv': dv,
            'base_Z1': m_base.beta[0], 'base_p': m_base.pvalues[0], 'base_n': m_base.n_obs,
            'same_Z1': m_same.beta[0], 'same_p': m_same.pvalues[0], 'same_n': m_same.n_obs,
            'ctrl_Z1': m_ctrl.beta[0], 'ctrl_p': m_ctrl.pvalues[0], 'ctrl_n': m_ctrl.n_obs,
            'attenuation': attenuation,
        })
    results['test4'] = pd.DataFrame(test4)

    # ── Test 5: Bank lending vs IIP debt comparison ──
    print("\n" + "=" * 60)
    print("TEST 5: Bank Claims vs IIP Debt Assets — Coverage Comparison")
    print("=" * 60)

    for pos in ['bank_claims_out_gdp', 'debt_assets_gdp']:
        if pos not in panel.columns:
            continue
        sub = panel.dropna(subset=[pos])
        print(f"  {pos}: {sub['iso3'].nunique()} countries, {len(sub)} obs, "
              f"years {sub['year'].min()}-{sub['year'].max()}")
        print(f"    Mean={sub[pos].mean():.4f}, Median={sub[pos].median():.4f}")

    # Correlation between bank claims and IIP debt
    both = panel.dropna(subset=['bank_claims_out_gdp', 'debt_assets_gdp'])
    if len(both) > 0:
        corr = both[['bank_claims_out_gdp', 'debt_assets_gdp']].corr().iloc[0, 1]
        print(f"\n  Correlation(bank_claims_out, debt_assets): {corr:.3f}, N={len(both)}")

    return results


def write_output(results):
    """Write output table."""
    lines = []
    lines.append("# BIS Bank Lending and Demographics")
    lines.append("")
    lines.append("*Tests whether cross-border bank lending (BIS LBS) represents a missing channel")
    lines.append("in the net/gross paper's IIP-based analysis.*")
    lines.append("")

    stars = lambda p: '***' if p < 0.01 else ('**' if p < 0.05 else ('*' if p < 0.1 else ''))

    # Test 1
    if 'test1' in results and len(results['test1']) > 0:
        lines.append("## Test 1: Demographics → Bank Lending Positions")
        lines.append("")
        lines.append("| Variable | Z₁ | SE | p | Z₂ | p | N | Countries | R² |")
        lines.append("|:--|--:|--:|--:|--:|--:|--:|--:|--:|")
        for _, r in results['test1'].iterrows():
            s = stars(r['Z1_p'])
            s2 = stars(r['Z2_p'])
            lines.append(f"| {r['label']} | {r['Z1_beta']:.4f}{s} | {r['Z1_se']:.4f} | "
                        f"{r['Z1_p']:.3f} | {r['Z2_beta']:.4f}{s2} | {r['Z2_p']:.3f} | "
                        f"{r['n']} | {r['n_countries']} | {r['r2']:.3f} |")
        lines.append("")

    # Test 2
    if 'test2' in results and len(results['test2']) > 0:
        lines.append("## Test 2: OECD vs Non-OECD")
        lines.append("")
        lines.append("| Subsample | Variable | Z₁ | p | N | R² |")
        lines.append("|:--|:--|--:|--:|--:|--:|")
        for _, r in results['test2'].iterrows():
            s = stars(r['Z1_p'])
            dv_short = r['dv'].replace('bank_claims_', '').replace('_gdp', '')
            lines.append(f"| {r['subsample']} | {dv_short} | {r['Z1_beta']:.4f}{s} | "
                        f"{r['Z1_p']:.3f} | {r['n']} | {r['r2']:.3f} |")
        lines.append("")

    # Test 3
    if 'test3' in results and len(results['test3']) > 0:
        lines.append("## Test 3: Z₁×KAOPEN on Bank Lending")
        lines.append("")
        lines.append("| Variable | Z₁ | p | Z₁×KAOPEN | SE | p | N |")
        lines.append("|:--|--:|--:|--:|--:|--:|--:|")
        for _, r in results['test3'].iterrows():
            s1 = stars(r['Z1_p'])
            si = stars(r['int_p'])
            dv_short = r['dv'].replace('bank_claims_', '').replace('_gdp', '')
            lines.append(f"| {dv_short} | {r['Z1_beta']:.4f}{s1} | {r['Z1_p']:.3f} | "
                        f"{r['int_beta']:.4f}{si} | {r['int_se']:.4f} | {r['int_p']:.3f} | {r['n']} |")
        lines.append("")

    # Test 4
    if 'test4' in results and len(results['test4']) > 0:
        lines.append("## Test 4: Does Bank Lending Control Change Core Findings?")
        lines.append("")
        lines.append("*Same-sample comparison (toolkit item 5.8) to avoid composition artifacts.*")
        lines.append("")
        lines.append("| DV | Baseline Z₁ | p | Same-sample Z₁ | p | + Bank ctrl Z₁ | p | Attenuation | N |")
        lines.append("|:--|--:|--:|--:|--:|--:|--:|--:|--:|")
        for _, r in results['test4'].iterrows():
            sb = stars(r['base_p'])
            ss = stars(r['same_p'])
            sc = stars(r['ctrl_p'])
            att = f"{r['attenuation']:+.0f}%" if pd.notna(r['attenuation']) else "—"
            lines.append(f"| {r['dv']} | {r['base_Z1']:.2f}{sb} | {r['base_p']:.3f} | "
                        f"{r['same_Z1']:.2f}{ss} | {r['same_p']:.3f} | "
                        f"{r['ctrl_Z1']:.2f}{sc} | {r['ctrl_p']:.3f} | {att} | {r['ctrl_n']} |")
        lines.append("")

    lines.append("## Synthesis")
    lines.append("")
    lines.append("*Assessment of whether bank lending data changes net/gross paper findings.*")
    lines.append("")
    lines.append("---")
    lines.append(f"*Generated: March 8, 2026. Data: BIS LBS (1977-2025), net_gross_panel.csv.*")

    outpath = TABLES_DIR / "table_bis_bank_lending.md"
    outpath.write_text('\n'.join(lines))
    print(f"\nSaved: {outpath}")


def main():
    print("=" * 70)
    print("PHASE 10: BIS Bank Lending and Demographics")
    print("=" * 70)

    # ── Load BIS LBS ──
    bilateral = load_bis_lbs()
    if len(bilateral) == 0:
        print("  No BIS data loaded — aborting")
        return

    print(f"\n  Bilateral claims: {len(bilateral):,} obs")
    print(f"  Reporters: {bilateral['reporter_iso3'].nunique()}")
    print(f"  Counterparties: {bilateral['counterparty_iso3'].nunique()}")
    print(f"  Years: {bilateral['year'].min()}-{bilateral['year'].max()}")

    # ── Aggregate to country-year ──
    bank_agg = aggregate_to_country_year(bilateral)
    print(f"\n  Country-year aggregate: {len(bank_agg):,} obs, "
          f"{bank_agg['iso3'].nunique()} countries")

    # ── Merge with panel ──
    panel = merge_with_panel(bank_agg)
    bank_coverage = panel['bank_claims_out_gdp'].notna().sum()
    print(f"  Panel with bank data: {bank_coverage} obs with bank claims")

    # ── Run tests ──
    test_results = run_tests(panel)

    # ── Write output ──
    write_output(test_results)

    print("\n" + "=" * 70)
    print("DONE")
    print("=" * 70)


if __name__ == "__main__":
    main()
